In [57]:
import graphlab
# Point GraphLab Canvas at the 'ipynb' target so its visualisations are
# directed to the notebook.
graphlab.canvas.set_target('ipynb')
# IPython magic: draw matplotlib figures inline in the notebook.
%matplotlib inline
# Load the song data from the local 'song_data.gl' path into an SFrame.
songs = graphlab.SFrame('song_data.gl')
# Peek at the first row to see which columns are available.
songs[0]
In [58]:
# Display the first row of the data set: one (user, song) record with
# artist, title, song_id and listen_count fields.
songs[0]
Out[58]:
{'artist': 'Jack Johnson',
 'listen_count': 1,
 'song': 'The Cove - Jack Johnson',
 'song_id': 'SOAKIMP12A8C130995',
 'title': 'The Cove',
 'user_id': 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'}
In [59]:
# Split the rows 80/20 into train and test sets; the fixed seed keeps the
# split repeatable across runs.
train_data, test_data = songs.random_split(0.8, seed=0)
# Distinct user ids across the full data set (before splitting).
users = songs['user_id'].unique()
In [60]:
# Baseline: a simple popularity-based recommender trained on the train split.
pop_model = graphlab.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)

# Recommend for a single user with pop_model.
# Drawback: every user is recommended the same songs.
first_user = users[0]
print(pop_model.recommend(users=[first_user]))
Recsys training: model = popularity
Warning: Ignoring columns song_id, listen_count, title, artist;
    To use one of these as a target column, set target = 
    and use a method that allows the use of a target.
Preparing data set.
    Data has 893580 observations with 66085 users and 9952 items.
    Data prepared in: 2.04779s
893580 observations to process; with 9952 unique items.
+-------------------------------+--------------------------------+--------+------+
|            user_id            |              song              | score  | rank |
+-------------------------------+--------------------------------+--------+------+
| b048033af070b5dbb18d5d0e5f... |    Sehr kosmisch - Harmonia    | 4754.0 |  1   |
| b048033af070b5dbb18d5d0e5f... |          Undo - Björk          | 4227.0 |  2   |
| b048033af070b5dbb18d5d0e5f... | You're The One - Dwight Yoakam | 3781.0 |  3   |
| b048033af070b5dbb18d5d0e5f... |    Revelry - Kings Of Leon     | 3527.0 |  4   |
| b048033af070b5dbb18d5d0e5f... | Horn Concerto No. 4 in E f...  | 3161.0 |  5   |
| b048033af070b5dbb18d5d0e5f... |     Secrets - OneRepublic      | 3148.0 |  6   |
| b048033af070b5dbb18d5d0e5f... |    Hey_ Soul Sister - Train    | 2538.0 |  7   |
| b048033af070b5dbb18d5d0e5f... | Fireflies - Charttraxx Karaoke | 2532.0 |  8   |
| b048033af070b5dbb18d5d0e5f... |       Tive Sim - Cartola       | 2521.0 |  9   |
| b048033af070b5dbb18d5d0e5f... | Drop The World - Lil Wayne...  | 2053.0 |  10  |
+-------------------------------+--------------------------------+--------+------+
[10 rows x 4 columns]

In [61]:
# Personalized song recommender based on item-to-item similarity,
# trained on the same train split as the popularity baseline.
per_model = graphlab.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)
Recsys training: model = item_similarity
Warning: Ignoring columns song_id, listen_count, title, artist;
    To use one of these as a target column, set target = 
    and use a method that allows the use of a target.
Preparing data set.
    Data has 893580 observations with 66085 users and 9952 items.
    Data prepared in: 2.06415s
Training model from provided data.
Gathering per-item and per-user statistics.
+--------------------------------+------------+
| Elapsed Time (Item Statistics) | % Complete |
+--------------------------------+------------+
| 5.57ms                         | 1.5        |
| 126.759ms                      | 100        |
+--------------------------------+------------+
Setting up lookup tables.
Processing data in one pass using dense lookup tables.
+-------------------------------------+------------------+-----------------+
| Elapsed Time (Constructing Lookups) | Total % Complete | Items Processed |
+-------------------------------------+------------------+-----------------+
| 347.748ms                           | 0                | 0               |
| 2.23s                               | 100              | 9952            |
+-------------------------------------+------------------+-----------------+
Finalizing lookup tables.
Generating candidate set for working with new users.
Finished training in 2.41476s
In [62]:
# Personalized recommendations from per_model for the first user in `users`.
target_users = [users[0]]
print(per_model.recommend(users=target_users))
+-------------------------------+--------------------------------+
|            user_id            |              song              |
+-------------------------------+--------------------------------+
| b048033af070b5dbb18d5d0e5f... |        Fantasy - The xx        |
| b048033af070b5dbb18d5d0e5f... | Walk In The Park - Beach House |
| b048033af070b5dbb18d5d0e5f... |  Lover Of Mine - Beach House   |
| b048033af070b5dbb18d5d0e5f... |    Used To Be - Beach House    |
| b048033af070b5dbb18d5d0e5f... |      Norway - Beach House      |
| b048033af070b5dbb18d5d0e5f... |    Take Care - Beach House     |
| b048033af070b5dbb18d5d0e5f... |   Paper Gangsta - Lady GaGa    |
| b048033af070b5dbb18d5d0e5f... |    Real Love - Beach House     |
| b048033af070b5dbb18d5d0e5f... | Beautiful_ Dirty_ Rich - L...  |
| b048033af070b5dbb18d5d0e5f... |   Boys Boys Boys - Lady GaGa   |
+-------------------------------+--------------------------------+
+-----------------+------+
|      score      | rank |
+-----------------+------+
| 0.0377200168112 |  1   |
| 0.0297601197077 |  2   |
| 0.0273156917613 |  3   |
| 0.0258316345837 |  4   |
| 0.0219293936439 |  5   |
| 0.0215793044671 |  6   |
| 0.0197489676268 |  7   |
| 0.0195482217747 |  8   |
| 0.0191167100616 |  9   |
| 0.0190747302511 |  10  |
+-----------------+------+
[10 rows x 4 columns]

In [63]:
# Songs that the item-similarity model scores as closest to the given track.
seed_songs = ['With Or Without You - U2']
print(per_model.get_similar_items(seed_songs))
+--------------------------+-------------------------------+-----------------+
|           song           |            similar            |      score      |
+--------------------------+-------------------------------+-----------------+
| With Or Without You - U2 | I Still Haven't Found What... |  0.042857170105 |
| With Or Without You - U2 | Hold Me_ Thrill Me_ Kiss M... | 0.0337349176407 |
| With Or Without You - U2 |    Window In The Skies - U2   | 0.0328358411789 |
| With Or Without You - U2 |          Vertigo - U2         | 0.0300751924515 |
| With Or Without You - U2 |   Sunday Bloody Sunday - U2   | 0.0271317958832 |
| With Or Without You - U2 |            Bad - U2           | 0.0251798629761 |
| With Or Without You - U2 |     A Day Without Me - U2     | 0.0237154364586 |
| With Or Without You - U2 | Another Time Another Place... | 0.0203251838684 |
| With Or Without You - U2 |          Walk On - U2         | 0.0202020406723 |
| With Or Without You - U2 |     Get On Your Boots - U2    | 0.0196850299835 |
+--------------------------+-------------------------------+-----------------+
+------+
| rank |
+------+
|  1   |
|  2   |
|  3   |
|  4   |
|  5   |
|  6   |
|  7   |
|  8   |
|  9   |
|  10  |
+------+
[10 rows x 4 columns]

In [64]:
# Quantitative comparison between the two models on the held-out test split.
# NOTE(review): this magic is also issued in the notebook's opening cell, so
# the repeat here looks redundant — confirm before removing.
%matplotlib inline
# Evaluate both models on test_data; user_sample=0.05 restricts the
# evaluation to a sample of the users so it finishes quickly.
model_performance = graphlab.compare(test_data, [pop_model, per_model], user_sample=0.05)
compare_models: using 2931 users to estimate model performance
PROGRESS: Evaluate model M0
recommendations finished on 1000/2931 queries. users per second: 736.174
recommendations finished on 2000/2931 queries. users per second: 720.852

Precision and recall summary statistics by cutoff
recommendations finished on 1000/2931 queries. users per second: 722.834
recommendations finished on 2000/2931 queries. users per second: 716.251
+--------+-----------------+------------------+
| cutoff |  mean_precision |   mean_recall    |
+--------+-----------------+------------------+
|   1    | 0.0327533265097 | 0.00904239056747 |
|   2    | 0.0308768338451 | 0.0170281801295  |
|   3    | 0.0268395314455 | 0.0216805920491  |
|   4    | 0.0239679290345 | 0.0259256059154  |
|   5    | 0.0218355510065 | 0.0295027454444  |
|   6    | 0.0208120095531 | 0.0347657251956  |
|   7    | 0.0195447677536 | 0.0383486864551  |
|   8    | 0.0190208120096 | 0.0420459284072  |
|   9    | 0.0180067477918 | 0.0444959083905  |
|   10   | 0.0175025588536 | 0.0474526828416  |
+--------+-----------------+------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1

Precision and recall summary statistics by cutoff
+--------+-----------------+-----------------+
| cutoff |  mean_precision |   mean_recall   |
+--------+-----------------+-----------------+
|   1    |  0.19208461276  |  0.060806394788 |
|   2    |  0.15933128625  | 0.0962222375221 |
|   3    |  0.139087910838 |  0.123158924362 |
|   4    |  0.122569089048 |  0.143631714205 |
|   5    |  0.110064824292 |  0.159006370444 |
|   6    |  0.10161492096  |  0.175907571711 |
|   7    | 0.0951406150997 |  0.190366489343 |
|   8    | 0.0896451722961 |  0.203983680749 |
|   9    | 0.0841199438948 |  0.214282990409 |
|   10   | 0.0794268167861 |  0.223303755648 |
+--------+-----------------+-----------------+
[10 rows x 3 columns]

Model compare metric: precision_recall
In [65]:
# Visualise the comparison of the two models: the personalized recommender
# is far better than the popularity recommender.
graphlab.show_comparison(model_performance, [pop_model, per_model])
In [66]:
# assignment : result 1
# Count the distinct users that have at least one row for each artist.
# A single loop replaces four copy-pasted queries; the counts are printed
# in the same artist order as before.
for artist_name in ['Kanye West', 'Foo Fighters', 'Taylor Swift', 'Lady GaGa']:
    # Keep only this artist's rows, then count the unique user ids in them.
    artist_rows = songs[songs['artist'] == artist_name]
    print(len(artist_rows['user_id'].unique()))
2522
2055
3246
2928
In [67]:
# assignment : result 2
# Sum listen_count per artist and order the artists from most to least
# listened to.
data = songs.groupby(
    key_columns='artist',
    operations={'total_count': graphlab.aggregate.SUM('listen_count')},
).sort('total_count', ascending=False)
print(data[0])   # first row: highest total listen count
print(data[-1])  # last row: lowest total listen count
{'total_count': 43218, 'artist': 'Kings Of Leon'}
{'total_count': 14, 'artist': 'William Tabbert'}
In [68]:
# assignment : result 3
# Take the first 10000 distinct user ids that appear in the test split.
test_user_ids = test_data['user_id'].unique()
subset_test_users = test_user_ids[0:10000]
In [69]:
# Get the single top recommendation (k=1) for every subset user, then count
# how many times each song was recommended, most frequent first.
data = (
    per_model.recommend(subset_test_users, k=1)
    .groupby(key_columns='song', operations={'count': graphlab.aggregate.COUNT()})
    .sort('count', ascending=False)
)
recommendations finished on 1000/10000 queries. users per second: 748.629
recommendations finished on 2000/10000 queries. users per second: 748.122
recommendations finished on 3000/10000 queries. users per second: 762.527
recommendations finished on 4000/10000 queries. users per second: 773.649
recommendations finished on 5000/10000 queries. users per second: 772.028
recommendations finished on 6000/10000 queries. users per second: 774.806
recommendations finished on 7000/10000 queries. users per second: 775.544
recommendations finished on 8000/10000 queries. users per second: 777.584
recommendations finished on 9000/10000 queries. users per second: 775.214
recommendations finished on 10000/10000 queries. users per second: 775.039
In [70]:
# Show the head of the per-song recommendation counts.
print(data)
+--------------------------------+-------+
|              song              | count |
+--------------------------------+-------+
|          Undo - Björk          |  453  |
|     Secrets - OneRepublic      |  375  |
|    Revelry - Kings Of Leon     |  227  |
| You're The One - Dwight Yoakam |  166  |
| Fireflies - Charttraxx Karaoke |  127  |
|    Hey_ Soul Sister - Train    |  107  |
| Horn Concerto No. 4 in E f...  |   96  |
|    Sehr kosmisch - Harmonia    |   84  |
| OMG - Usher featuring will...  |   71  |
|    U Smile - Justin Bieber     |   48  |
+--------------------------------+-------+
[3121 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.